# -*- coding: utf-8 -*-
"""
Text-cleaning helpers: Chinese-character check, punctuation removal and stop-word removal.
"""

import re
from zhon.hanzi import punctuation

def is_chinese(word):
    """Return True if ``word`` consists only of Chinese characters.

    A character counts as Chinese when it lies in the range
    U+4E00..U+9FFF (inclusive). Empty or falsy input returns False.

    Each character is tested individually: comparing the whole string
    against the range bounds would be a lexicographic comparison, which
    misclassifies multi-character input (e.g. a Chinese character followed
    by a Latin letter would be accepted).
    """
    if not word:
        return False
    return all('\u4e00' <= char <= '\u9fff' for char in word)

def remove_punctuation(string, keep_words=True):
    """
    Replace Chinese punctuation in ``string`` with spaces, then strip
    leading/trailing whitespace from the result.

    The set of punctuation characters comes from ``zhon.hanzi.punctuation``;
    every matching character is replaced by a single space.

    keep_words:
        False: every character of the punctuation set is replaced.
        True:  same, except the book-title marks and the middle dot
               (used in transliterated foreign names) are left in place.
    """
    # Characters exempt from removal when keep_words is set.
    preserved = ('《', '》', '·') if keep_words else ()
    targets = ''.join(mark for mark in punctuation if mark not in preserved)
    pattern = u"[%s]" % targets
    return re.sub(pattern, " ", string).strip()


def remove_stopwords(segwords, stopwords=None):
    """Drop stop words from a token sequence and join the rest with spaces.

    Args:
        segwords: iterable of string tokens (e.g. the output of a word
            segmenter).
        stopwords: container of tokens to discard. ``None`` (the default)
            means nothing is discarded.

    Returns:
        A single string with the remaining tokens separated by one space;
        an empty string when no tokens remain.
    """
    if stopwords is None:
        # Avoid a mutable default argument; an empty tuple filters nothing.
        blocked = ()
    elif isinstance(stopwords, (list, tuple)):
        # Build a hash-based lookup once so each membership test is O(1)
        # instead of a linear scan of the stop-word list.
        try:
            blocked = frozenset(stopwords)
        except TypeError:
            # Unhashable entries: fall back to the original container.
            blocked = stopwords
    else:
        # Sets, dict views, strings, etc. keep their own `in` semantics.
        blocked = stopwords
    return ' '.join(word for word in segwords if word not in blocked)

